In [1]:
#Import necessary libraries
import json
from IPython.display import Image
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
In [2]:
%load_ext autoreload
%autoreload 2
In [3]:
%matplotlib inline

Analyze Patent Information from Patent API¶

This project focuses on analyzing the patent information retrieved from the patentsview API. The PatentsView API is intended to inspire the exploration and enhanced understanding of US intellectual property (IP) and innovation systems.

The base endpoint url for the patent API is given as follows.

  • Patentsview (https://patentsview.org/apis/api-endpoints/patents)

This API version does not require any API key and has the following query parameters embedded to the url.

  • q - query paramters (Ex: patent_inv_country : "US")
  • f - JSON response fields (Ex: [patent_number", "patent_title", "patent_date"])
  • s - sort the results (Ex: "patent_number":"asc")
  • o - result options (Ex: "per_page": 10000)

The max size of results per page is limited to 10000 in this API version.

Retrieving Patent Data from the API:¶

Steps followed in the data retrieval process are mentioned as follows.

  • Set the response fields list to the required patent fields. The detailed data description is available in https://patentsview.org/apis/api-endpoints/patents - Patent Field List.
  • Set the required query parameters. Here, the inventor and assignee countries are filtered for US.
  • Set max results to 10000.
  • Sort the results by patent_number.
In [4]:
#List of response fields
response_fields_list = [ "patent_firstnamed_assignee_city", "patent_firstnamed_assignee_country",
                        "patent_firstnamed_assignee_id", "patent_firstnamed_assignee_latitude", 
                        "patent_firstnamed_assignee_longitude","patent_firstnamed_assignee_state", 
                        "patent_firstnamed_inventor_city", "patent_firstnamed_inventor_country",
                        "patent_firstnamed_inventor_id", "patent_firstnamed_inventor_latitude", 
                        "patent_firstnamed_inventor_longitude", "patent_firstnamed_inventor_state",
                        "patent_num_cited_by_us_patents", "patent_number", "patent_title", 
                        "patent_type", "patent_year", "patent_date"]
In [5]:
#Filters for the results
patent_inv_country = "US"
patent_assignee_country = "US"


# Form the final url
base_url = 'https://api.patentsview.org/patents/query?q={"_gte":{'
parameter_1 = f'"patent_firstnamed_inventor_country":"{patent_inv_country}"'
parameter_2 = f'"patent_firstnamed_assignee_country":"{patent_assignee_country}"'
response_fields = '}}&f=['

url = base_url + parameter_1 + "," + parameter_2 + response_fields
final_url = url

for i in range(0, len(response_fields_list)):
    final_url = final_url + '"' + response_fields_list[i] + '",'
    #print(final_url)
final_url = final_url[:-1:]
final_url = final_url + ']' + '&o={"per_page": 10000} &s=[{"patent_number":"asc"}]'

print(final_url)

payload={}

response = requests.request("GET", final_url, data=payload)

#print(response.text)
https://api.patentsview.org/patents/query?q={"_gte":{"patent_firstnamed_inventor_country":"US","patent_firstnamed_assignee_country":"US"}}&f=["patent_firstnamed_assignee_city","patent_firstnamed_assignee_country","patent_firstnamed_assignee_id","patent_firstnamed_assignee_latitude","patent_firstnamed_assignee_longitude","patent_firstnamed_assignee_state","patent_firstnamed_inventor_city","patent_firstnamed_inventor_country","patent_firstnamed_inventor_id","patent_firstnamed_inventor_latitude","patent_firstnamed_inventor_longitude","patent_firstnamed_inventor_state","patent_num_cited_by_us_patents","patent_number","patent_title","patent_type","patent_year","patent_date"]&o={"per_page": 10000} &s=[{"patent_number":"asc"}]
In [6]:
#Write the response to patent_details.json
with open('output/patent_details.json', 'w') as fout:
    fout.write(response.text)

Reading the json file back in.

In [7]:
patent_details_text = open('output/patent_details.json').read()

patent_details = json.loads(patent_details_text)
print(type(patent_details))
<class 'dict'>

Store the patent details to a dataframe

In [8]:
patent_details_df = pd.DataFrame(patent_details['patents'])
#Convert patent_date to date and extract patent month from patent date
patent_details_df['patent_date'] = pd.to_datetime(patent_details_df['patent_date'], format="%Y/%m/%d")
patent_details_df['patent_month'] = pd.DatetimeIndex(patent_details_df['patent_date']).month
print(patent_details_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   patent_firstnamed_assignee_city       9049 non-null   object        
 1   patent_firstnamed_assignee_country    9106 non-null   object        
 2   patent_firstnamed_assignee_id         9125 non-null   object        
 3   patent_firstnamed_assignee_latitude   9063 non-null   object        
 4   patent_firstnamed_assignee_longitude  9063 non-null   object        
 5   patent_firstnamed_assignee_state      8391 non-null   object        
 6   patent_firstnamed_inventor_city       10000 non-null  object        
 7   patent_firstnamed_inventor_country    10000 non-null  object        
 8   patent_firstnamed_inventor_id         10000 non-null  object        
 9   patent_firstnamed_inventor_latitude   9936 non-null   object        
 10  patent_firstnamed_inventor_longitude  9936 non-null   object        
 11  patent_firstnamed_inventor_state      9983 non-null   object        
 12  patent_num_cited_by_us_patents        10000 non-null  object        
 13  patent_number                         10000 non-null  object        
 14  patent_title                          10000 non-null  object        
 15  patent_type                           10000 non-null  object        
 16  patent_year                           10000 non-null  object        
 17  patent_date                           10000 non-null  datetime64[ns]
 18  patent_month                          10000 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(17)
memory usage: 1.4+ MB
None
In [9]:
patent_details_df.head(5)
Out[9]:
patent_firstnamed_assignee_city patent_firstnamed_assignee_country patent_firstnamed_assignee_id patent_firstnamed_assignee_latitude patent_firstnamed_assignee_longitude patent_firstnamed_assignee_state patent_firstnamed_inventor_city patent_firstnamed_inventor_country patent_firstnamed_inventor_id patent_firstnamed_inventor_latitude patent_firstnamed_inventor_longitude patent_firstnamed_inventor_state patent_num_cited_by_us_patents patent_number patent_title patent_type patent_year patent_date patent_month
0 Waltham US e915981d-a0eb-4a13-a817-bc4597d35b98 42.3756 -71.2358 MA Manhattan Beach US fl:jo_ln:marron-5 33.8842 -118.426 CA 4 10000000 Coherent LADAR using intra-pixel quadrature de... utility 2018 2018-06-19 6
1 Brookfield US 0bf96d08-fe99-4bbb-ae0d-52bc76f5b051 43.0607 -88.1263 WI Brookfield US fl:co_ln:dickert-1 43.0607 -88.1263 WI 4 10000007 PEX expanding tool utility 2018 2018-06-19 6
2 Northvale US dc1b4faf-5315-40df-af2b-e688832e5286 41.0086 -73.9482 NJ Closter US fl:li_ln:caspi-2 40.9709 -73.969 NJ 0 10000008 Bracelet mold and method of use utility 2018 2018-06-19 6
3 None None None None None None Hayward US fl:na_ln:maier-3 37.6688 -122.081 CA 1 10000009 Sterile environment for additive manufacturing utility 2018 2018-06-19 6
4 Norwalk US 903324e5-5bbb-4187-8ee9-4f3291165199 41.0958 -73.4205 CT Rochester US fl:ro_ln:irizarry-1 43.1855 -77.6107 NY 0 10000010 3-D electrostatic printer using rack and pinio... utility 2018 2018-06-19 6

Data Cleaning¶

In [10]:
#Check for NaN values
patent_details_df.isna().any()
Out[10]:
patent_firstnamed_assignee_city          True
patent_firstnamed_assignee_country       True
patent_firstnamed_assignee_id            True
patent_firstnamed_assignee_latitude      True
patent_firstnamed_assignee_longitude     True
patent_firstnamed_assignee_state         True
patent_firstnamed_inventor_city         False
patent_firstnamed_inventor_country      False
patent_firstnamed_inventor_id           False
patent_firstnamed_inventor_latitude      True
patent_firstnamed_inventor_longitude     True
patent_firstnamed_inventor_state         True
patent_num_cited_by_us_patents          False
patent_number                           False
patent_title                            False
patent_type                             False
patent_year                             False
patent_date                             False
patent_month                            False
dtype: bool
In [11]:
#Drop rows with NaN values
patent_details_df.dropna(inplace = True)
In [12]:
patent_details_df.head(5)
Out[12]:
patent_firstnamed_assignee_city patent_firstnamed_assignee_country patent_firstnamed_assignee_id patent_firstnamed_assignee_latitude patent_firstnamed_assignee_longitude patent_firstnamed_assignee_state patent_firstnamed_inventor_city patent_firstnamed_inventor_country patent_firstnamed_inventor_id patent_firstnamed_inventor_latitude patent_firstnamed_inventor_longitude patent_firstnamed_inventor_state patent_num_cited_by_us_patents patent_number patent_title patent_type patent_year patent_date patent_month
0 Waltham US e915981d-a0eb-4a13-a817-bc4597d35b98 42.3756 -71.2358 MA Manhattan Beach US fl:jo_ln:marron-5 33.8842 -118.426 CA 4 10000000 Coherent LADAR using intra-pixel quadrature de... utility 2018 2018-06-19 6
1 Brookfield US 0bf96d08-fe99-4bbb-ae0d-52bc76f5b051 43.0607 -88.1263 WI Brookfield US fl:co_ln:dickert-1 43.0607 -88.1263 WI 4 10000007 PEX expanding tool utility 2018 2018-06-19 6
2 Northvale US dc1b4faf-5315-40df-af2b-e688832e5286 41.0086 -73.9482 NJ Closter US fl:li_ln:caspi-2 40.9709 -73.969 NJ 0 10000008 Bracelet mold and method of use utility 2018 2018-06-19 6
4 Norwalk US 903324e5-5bbb-4187-8ee9-4f3291165199 41.0958 -73.4205 CT Rochester US fl:ro_ln:irizarry-1 43.1855 -77.6107 NY 0 10000010 3-D electrostatic printer using rack and pinio... utility 2018 2018-06-19 6
5 Watertown US 94fabadd-7a5a-43dd-8afe-2e21db4d7481 42.372 -71.1754 MA Cambridge US fl:gr_ln:mark-2 42.3784 -71.1316 MA 0 10000011 Supports for sintering additively manufactured... utility 2018 2018-06-19 6

Quick groupby to compare patent inventors by state

In [13]:
compare_inv_by_state = patent_details_df.groupby('patent_firstnamed_inventor_state')['patent_firstnamed_inventor_id'].count().sort_values(ascending=False)

Quick groupby to compare patent assignees by state

In [14]:
compare_ass_by_state = patent_details_df.groupby('patent_firstnamed_assignee_state')['patent_firstnamed_assignee_id'].count().sort_values(ascending=False)
In [15]:
#Convert the comparison results to data frame
compare_inv_by_state_df = compare_inv_by_state.to_frame()
compare_ass_by_state_df = compare_ass_by_state.to_frame()
In [16]:
#Drop index and rename columns
compare_inv_by_state_df.reset_index(inplace = True)
compare_inv_by_state_df.rename(columns={"patent_firstnamed_inventor_state": "state", "patent_firstnamed_inventor_id": "no_of_inventors"}, inplace = True)
compare_inv_by_state_df
Out[16]:
state no_of_inventors
0 CA 2316
1 TX 601
2 NY 497
3 MA 439
4 WA 420
5 MI 407
6 IL 252
7 NJ 245
8 MN 234
9 OH 218
10 FL 211
11 PA 209
12 NC 184
13 OR 172
14 CO 170
15 CT 154
16 GA 142
17 WI 139
18 AZ 136
19 VA 125
20 IN 123
21 MD 109
22 UT 88
23 MO 73
24 SC 62
25 NH 61
26 ID 53
27 TN 52
28 KY 47
29 IA 44
30 KS 41
31 OK 40
32 AL 32
33 NV 23
34 NM 23
35 LA 21
36 RI 20
37 VT 17
38 NE 13
39 AR 13
40 ME 13
41 DE 12
42 DC 10
43 WV 7
44 WY 7
45 SD 6
46 ND 6
47 MS 6
48 MT 6
49 HI 5
50 PR 3
In [17]:
#Drop index and rename columns
compare_ass_by_state_df.reset_index(inplace = True)
compare_ass_by_state_df.rename(columns={"patent_firstnamed_assignee_state": "state", "patent_firstnamed_assignee_id": "no_of_assignees"}, inplace = True)
compare_ass_by_state_df
Out[17]:
state no_of_assignees
0 CA 2326
1 NY 751
2 TX 663
3 MA 427
4 MI 417
5 WA 343
6 IL 335
7 NJ 231
8 NC 224
9 CT 212
10 GA 207
11 MN 192
12 OH 182
13 PA 160
14 FL 154
15 CO 126
16 VA 105
17 IN 101
18 WI 95
19 AZ 94
20 MD 89
21 DE 83
22 MO 68
23 NV 65
24 DC 64
25 UT 62
26 TN 60
27 ID 53
28 OR 53
29 KS 47
30 NH 37
31 IA 37
32 SC 35
33 PR 30
34 KY 23
35 OK 22
36 NM 22
37 RI 18
38 AL 18
39 LA 18
40 AR 16
41 NE 11
42 VT 6
43 SD 6
44 WV 4
45 MT 4
46 HI 4
47 ND 2
48 ME 2
49 WY 2
50 MS 1
In [18]:
compare_by_state_df = compare_inv_by_state_df.merge(compare_ass_by_state_df, how='inner', on='state')
compare_by_state_df
Out[18]:
state no_of_inventors no_of_assignees
0 CA 2316 2326
1 TX 601 663
2 NY 497 751
3 MA 439 427
4 WA 420 343
5 MI 407 417
6 IL 252 335
7 NJ 245 231
8 MN 234 192
9 OH 218 182
10 FL 211 154
11 PA 209 160
12 NC 184 224
13 OR 172 53
14 CO 170 126
15 CT 154 212
16 GA 142 207
17 WI 139 95
18 AZ 136 94
19 VA 125 105
20 IN 123 101
21 MD 109 89
22 UT 88 62
23 MO 73 68
24 SC 62 35
25 NH 61 37
26 ID 53 53
27 TN 52 60
28 KY 47 23
29 IA 44 37
30 KS 41 47
31 OK 40 22
32 AL 32 18
33 NV 23 65
34 NM 23 22
35 LA 21 18
36 RI 20 18
37 VT 17 6
38 NE 13 11
39 AR 13 16
40 ME 13 2
41 DE 12 83
42 DC 10 64
43 WV 7 4
44 WY 7 2
45 SD 6 6
46 ND 6 2
47 MS 6 1
48 MT 6 4
49 HI 5 4
50 PR 3 30

From the above comparison, it is clear that the state of California has the highest number of patent inventors and patent assignees in this dataset.

Data Visualization¶

The patent information is further explored with different visualization packages like ipywidgets, plotly, Bokeh and Streamlit.

Visualizing patent data using ipywidgets¶

In [19]:
#Get the list of columns present in the patent_details_df dataframe
patent_data_cols = patent_details_df.columns.to_list()
patent_data_cols
Out[19]:
['patent_firstnamed_assignee_city',
 'patent_firstnamed_assignee_country',
 'patent_firstnamed_assignee_id',
 'patent_firstnamed_assignee_latitude',
 'patent_firstnamed_assignee_longitude',
 'patent_firstnamed_assignee_state',
 'patent_firstnamed_inventor_city',
 'patent_firstnamed_inventor_country',
 'patent_firstnamed_inventor_id',
 'patent_firstnamed_inventor_latitude',
 'patent_firstnamed_inventor_longitude',
 'patent_firstnamed_inventor_state',
 'patent_num_cited_by_us_patents',
 'patent_number',
 'patent_title',
 'patent_type',
 'patent_year',
 'patent_date',
 'patent_month']

Create the widget for the columns present in the dataframe. The options=data_cols will populate the dropdown options with our column names.

In [20]:
pick_col = widgets.Dropdown(
    options=patent_data_cols,
    description='Column',layout={'width': 'max-content'}
)
In [21]:
display(pick_col)
Dropdown(description='Column', layout=Layout(width='max-content'), options=('patent_firstnamed_assignee_city',…
In [22]:
patent_details_df['patent_firstnamed_inventor_city'].value_counts().size
Out[22]:
2003

Create a wrapper function that takes a data column from the dataframe and creates a countplot using seaborn.

In [23]:
def create_countplot(col):
    
    ax = sns.countplot(y=patent_details_df[col], data=patent_details_df,
                  order=patent_details_df[col].value_counts().iloc[:10].index)
    ax.set(xlabel='Number of patents');

Using interact function create user interface (UI) controls for exploring the patent data

In [24]:
widgets.interact(create_countplot, col=pick_col);
interactive(children=(Dropdown(description='Column', layout=Layout(width='max-content'), options=('patent_firs…

Create a wrapper function that takes number of categories to be displayed as input argument and creates a countplot of top inventor states using seaborn.

In [25]:
def countplot_top_inv_states(n):
    ax = sns.countplot(y=patent_details_df['patent_firstnamed_inventor_state'], data=patent_details_df,
                  order=patent_details_df['patent_firstnamed_inventor_state'].value_counts(ascending=False).iloc[:n].index)
    ax.set(xlabel='Number of patents')
    ax.set(ylabel='State');
In [26]:
style = {'description_width': 'initial'}

pick_n = widgets.IntSlider(
    value=1,
    min=2,
    max=15,
    step=1,
    style=style,
    description='Number of categories'
)
In [27]:
widgets.interact(countplot_top_inv_states, n=pick_n);
interactive(children=(IntSlider(value=2, description='Number of categories', max=15, min=2, style=SliderStyle(…

Plotly Visualizations¶

Scatter plot of number of inventors and no of assignees by state using plotly express

In [28]:
fig = px.scatter(compare_by_state_df, x="no_of_inventors", y="no_of_assignees", color="state")
fig.show()

Exploring the locations of patent inventors using scatter_geo map. The latitude and longitude values of the inventor are used in the plot.

In [39]:
fig = px.scatter_geo(patent_details_df,lat=patent_details_df['patent_firstnamed_inventor_latitude'],
                     lon=patent_details_df['patent_firstnamed_inventor_longitude'],
                     hover_name=patent_details_df['patent_firstnamed_inventor_city'],
                     hover_data=["patent_firstnamed_inventor_id", "patent_firstnamed_inventor_state"])
fig.update_layout(title = 'Exploring the locations of patent inventors - 2018', title_x=0.5)
fig.show()

Exploring the locations of patent assignees using scatter_geo map. The latitude and longitude values of the assignees are used in the plot.

In [30]:
fig = px.scatter_geo(patent_details_df,lat=patent_details_df['patent_firstnamed_assignee_latitude'],
                     lon=patent_details_df['patent_firstnamed_assignee_longitude'], hover_name=patent_details_df['patent_firstnamed_assignee_city'],
                    hover_data=["patent_firstnamed_assignee_id", "patent_firstnamed_assignee_state"])
fig.update_layout(title = 'Exploring the locations of patent assignees - 2018', title_x=0.5)
fig.show()

From the geo map we can see higher presence of inventors and assignees in the mid west and north eastern part of the United States.

Patent Data Visualization using Bokeh¶

Create a bar plot to visualize the top inventor cities with most number of patents in the dataset.

Using groupby the number of patents by inventor city is retrieved and the results are stored in cat_inventor_city_df dataframe

In [31]:
cat_inventor_city = patent_details_df.groupby(['patent_firstnamed_inventor_city'])['patent_number'].count().sort_values(ascending=False).head(10)
cat_inventor_city_df = cat_inventor_city.to_frame()
cat_inventor_city_df.reset_index(inplace = True)
cat_inventor_city_df.rename(columns={"patent_firstnamed_inventor_city": "inventor_city", "patent_number": "no_of_patents"}, inplace = True)
cat_inventor_city_df.set_index(keys = 'inventor_city', inplace = True)
In [32]:
cat_inventor_city_df
Out[32]:
no_of_patents
inventor_city
San Jose 270
San Francisco 212
San Diego 188
Seattle 125
Austin 125
Sunnyvale 99
Houston 94
Palo Alto 92
Cupertino 88
Mountain View 84
In [33]:
import pandas_bokeh
pandas_bokeh.output_notebook()
pd.set_option('plotting.backend', 'pandas_bokeh')
# Create Bokeh-Table with DataFrame:
from bokeh.models.widgets import DataTable, TableColumn
from bokeh.models import ColumnDataSource
cat_inventor_city_df.plot_bokeh(kind="bar",title ="Top 10 Cities with most patents",figsize =(900,400),xlabel = "inventor city",ylabel="no of patents");
Loading BokehJS ...

From the above bar chart, we can see that the city San Jose has most number of patents followed by San Francisco and San Diego.

Bar Charts using Streamlit¶

Bar charts for visualizing the top assignee cities and top inventor cities are created using Streamlit.

Refer patent_barchart.py for addtional details.

Using groupby the number of patents by assignee city is retrieved and the results are stored in cat_assignee_city_df dataframe

In [34]:
cat_assignee_city = patent_details_df.groupby(['patent_firstnamed_assignee_city'])['patent_number'].count().sort_values(ascending=False).head(10)
cat_assignee_city_df = cat_assignee_city.to_frame()
cat_assignee_city_df.reset_index(inplace = True)
cat_assignee_city_df.rename(columns={"patent_firstnamed_assignee_city": "assignee_city", "patent_number": "no_of_patents"}, inplace = True)
cat_assignee_city_df.set_index(keys = 'assignee_city', inplace = True)
In [35]:
st.title('Patent Dashboard')
st.markdown('The dashboard will visualize the patent data retrieved from the patent API')

st.markdown('## **Patent data**')
st.dataframe(patent_details_df)

st.markdown('## **Patents by inventor city**')
st.bar_chart(cat_inventor_city_df)
st.markdown('## **Patents by assignee city**')
st.bar_chart(cat_assignee_city_df)
2022-06-24 20:55:14.297 
  Warning: to view this Streamlit app on a browser, run it with the following
  command:

    streamlit run C:\Users\mbala\anaconda3\envs\aap\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
Out[35]:
DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)

Patent Date Analysis¶

What is the average number of patents invented per day?¶

In [36]:
# Create a Group by object using patent_date
grp_date = patent_details_df.groupby('patent_date')

# Compute number of patents by date and check out the result
patents_by_date = pd.DataFrame(grp_date.size(), columns=['no_of_patents'])

patents_by_date
Out[36]:
no_of_patents
patent_date
2018-06-19 1498
2018-06-26 2210
2018-07-03 2322
2018-07-10 2268
2018-07-17 9
In [37]:
mean_patents = patents_by_date['no_of_patents'].mean()
tot_patent_dates = patents_by_date['no_of_patents'].count()
num_days = (patents_by_date.index.max() - patents_by_date.index.min()).days + 1

print("The mean number of patents per day is {:.2f}. The mean is based on {} patent dates."
      .format(mean_patents, tot_patent_dates))
print("The beginning of the date range is {}.".format(patents_by_date.index.min()))
print("The end of the date range is {}.".format(patents_by_date.index.max()))
print("The are {} days in the date range.".format(num_days))
The mean number of patents per day is 1661.40. The mean is based on 5 patent dates.
The beginning of the date range is 2018-06-19 00:00:00.
The end of the date range is 2018-07-17 00:00:00.
The are 29 days in the date range.

Plotting number of patents by patent_date

In [38]:
patent_details_df_2 = patent_details_df.copy()

patents_details_by_date = patent_details_df_2.groupby('patent_date')['patent_number'].count().sort_values(ascending=False)
pat_det_by_date_df = patents_details_by_date.to_frame()
pat_det_by_date_df.reset_index(inplace = True)
pat_det_by_date_df.rename(columns={"patent_date": "date", "patent_number": "no_of_patents"}, inplace = True)
pat_det_by_date_df
pat_det_by_date_df.set_index('date', inplace = True)
pat_det_by_date_df['no_of_patents'].plot();

From the above plot, we can see a spike in the number of patents in the first two weeks of July. Since the API version returns only 10000 patents, the year is limited to 2018 data only.

In [ ]: